In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from math import sqrt
def smape(A, F):
return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))
def vape(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred))/np.mean(np.abs(y_true))
In [2]:
clicks_df = pd.read_csv("click_for_day.csv").drop(["Unnamed: 0","pred","date_obj"], axis=1)
In [3]:
clicks_df['date'] = pd.to_datetime(clicks_df['date']).dt.date
In [4]:
clicks_df.head()
Out[4]:
| date | month_dt | day_in_month_dt | day_of_week_dt | week_in_year_dt | is_holiday | day_in_year_dt | clicks | bidder_impact | new_befaviour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-01-01 | 1 | 1 | 0 | 1 | True | 1 | 112 | False | False |
| 1 | 2018-01-02 | 1 | 2 | 1 | 1 | False | 2 | 216 | False | False |
| 2 | 2018-01-03 | 1 | 3 | 2 | 1 | False | 3 | 167 | False | False |
| 3 | 2018-01-04 | 1 | 4 | 3 | 1 | False | 4 | 175 | False | False |
| 4 | 2018-01-05 | 1 | 5 | 4 | 1 | False | 5 | 158 | False | False |
In [5]:
fig = px.line(clicks_df, x="date", y="clicks", title='Count of clicks for a day')
fig.show()
In [5]:
weather_df = pd.read_csv("weather_history.csv")
In [6]:
weather_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 43056 entries, 0 to 43055 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 city 43056 non-null object 1 region 43056 non-null object 2 country 43056 non-null object 3 latitude 0 non-null float64 4 longitude 0 non-null float64 5 datetime 43056 non-null object 6 maxtempC 43056 non-null int64 7 mintempC 43056 non-null int64 8 avgtempC 43056 non-null int64 9 totalSnow_cm 43056 non-null float64 10 sunHour 43056 non-null float64 11 tempC 43056 non-null int64 12 windspeedKmph 43056 non-null int64 13 winddirDegree 43056 non-null int64 14 winddir16Point 43056 non-null object 15 weatherCode 43056 non-null int64 16 weatherDesc 0 non-null float64 17 precipMM 43056 non-null float64 18 humidity 43056 non-null int64 19 visibility 43056 non-null int64 20 pressure 43056 non-null int64 21 cloudcover 43056 non-null int64 22 HeatIndexC 43056 non-null int64 23 WindChillC 43056 non-null int64 24 WindGustKmph 43056 non-null int64 25 FeelsLikeC 43056 non-null int64 26 uvIndex 43056 non-null int64 dtypes: float64(6), int64(16), object(5) memory usage: 8.9+ MB
In [7]:
weather_df.describe()
Out[7]:
| latitude | longitude | maxtempC | mintempC | avgtempC | totalSnow_cm | sunHour | tempC | windspeedKmph | winddirDegree | ... | precipMM | humidity | visibility | pressure | cloudcover | HeatIndexC | WindChillC | WindGustKmph | FeelsLikeC | uvIndex | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 0.0 | 0.0 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | ... | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 | 43056.000000 |
| mean | NaN | NaN | 9.016211 | -0.179487 | 5.096386 | 0.353349 | 10.226709 | 4.158305 | 11.882130 | 213.241708 | ... | 0.053881 | 69.241662 | 9.033422 | 1015.371191 | 41.293107 | 4.285628 | 1.504645 | 19.061014 | 1.530147 | 1.978632 |
| std | NaN | NaN | 12.763126 | 10.719672 | 11.858362 | 1.333326 | 3.939936 | 12.005125 | 6.018685 | 91.218255 | ... | 0.286275 | 18.626919 | 2.275441 | 9.429675 | 34.606166 | 12.108482 | 14.055261 | 9.509353 | 14.093446 | 1.574934 |
| min | NaN | NaN | -28.000000 | -34.000000 | -31.000000 | 0.000000 | 2.000000 | -34.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 8.000000 | 0.000000 | 977.000000 | 0.000000 | -34.000000 | -44.000000 | 0.000000 | -44.000000 | 1.000000 |
| 25% | NaN | NaN | 0.000000 | -7.000000 | -3.000000 | 0.000000 | 6.900000 | -4.000000 | 8.000000 | 148.000000 | ... | 0.000000 | 56.000000 | 10.000000 | 1009.000000 | 10.000000 | -4.000000 | -8.000000 | 12.000000 | -8.000000 | 1.000000 |
| 50% | NaN | NaN | 9.000000 | 1.000000 | 6.000000 | 0.000000 | 10.300000 | 5.000000 | 11.000000 | 233.000000 | ... | 0.000000 | 72.000000 | 10.000000 | 1015.000000 | 31.000000 | 5.000000 | 2.000000 | 18.000000 | 2.000000 | 1.000000 |
| 75% | NaN | NaN | 19.000000 | 9.000000 | 15.000000 | 0.000000 | 13.500000 | 13.000000 | 15.000000 | 282.000000 | ... | 0.000000 | 84.000000 | 10.000000 | 1021.000000 | 73.000000 | 13.000000 | 12.000000 | 24.000000 | 12.000000 | 2.000000 |
| max | NaN | NaN | 38.000000 | 21.000000 | 32.000000 | 18.100000 | 16.300000 | 38.000000 | 51.000000 | 360.000000 | ... | 12.100000 | 100.000000 | 10.000000 | 1061.000000 | 100.000000 | 38.000000 | 38.000000 | 94.000000 | 38.000000 | 9.000000 |
8 rows × 22 columns
In [8]:
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
In [9]:
weather_df['date'] = weather_df['datetime'].dt.date
In [10]:
weather_df.head()
Out[10]:
| city | region | country | latitude | longitude | datetime | maxtempC | mintempC | avgtempC | totalSnow_cm | ... | humidity | visibility | pressure | cloudcover | HeatIndexC | WindChillC | WindGustKmph | FeelsLikeC | uvIndex | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Calgary | Alberta | Canada | NaN | NaN | 2018-04-01 00:00:00 | -8 | -14 | -10 | 0.1 | ... | 87 | 10 | 1029 | 14 | -13 | -17 | 13 | -17 | 1 | 2018-04-01 |
| 1 | Calgary | Alberta | Canada | NaN | NaN | 2018-04-01 01:00:00 | -8 | -14 | -10 | 0.1 | ... | 86 | 10 | 1028 | 25 | -13 | -16 | 11 | -16 | 1 | 2018-04-01 |
| 2 | Calgary | Alberta | Canada | NaN | NaN | 2018-04-01 02:00:00 | -8 | -14 | -10 | 0.1 | ... | 85 | 10 | 1028 | 36 | -13 | -14 | 8 | -14 | 1 | 2018-04-01 |
| 3 | Calgary | Alberta | Canada | NaN | NaN | 2018-04-01 03:00:00 | -8 | -14 | -10 | 0.1 | ... | 84 | 10 | 1027 | 47 | -13 | -13 | 6 | -13 | 1 | 2018-04-01 |
| 4 | Calgary | Alberta | Canada | NaN | NaN | 2018-04-01 04:00:00 | -8 | -14 | -10 | 0.1 | ... | 84 | 10 | 1026 | 62 | -12 | -12 | 6 | -12 | 1 | 2018-04-01 |
5 rows × 28 columns
In [11]:
weather_df['city'].value_counts()
Out[11]:
city Calgary 43056 Name: count, dtype: int64
In [12]:
weather_df.columns
Out[12]:
Index(['city', 'region', 'country', 'latitude', 'longitude', 'datetime',
'maxtempC', 'mintempC', 'avgtempC', 'totalSnow_cm', 'sunHour', 'tempC',
'windspeedKmph', 'winddirDegree', 'winddir16Point', 'weatherCode',
'weatherDesc', 'precipMM', 'humidity', 'visibility', 'pressure',
'cloudcover', 'HeatIndexC', 'WindChillC', 'WindGustKmph', 'FeelsLikeC',
'uvIndex', 'date'],
dtype='object')
In [13]:
weather_df.drop(['city', 'region', 'country', 'latitude', 'longitude', 'datetime',"winddir16Point","weatherDesc"], axis=1, inplace=True)
In [14]:
weather_df_by_date = weather_df.groupby(['date'], as_index=False).mean()
In [15]:
weather_df
Out[15]:
| maxtempC | mintempC | avgtempC | totalSnow_cm | sunHour | tempC | windspeedKmph | winddirDegree | weatherCode | precipMM | humidity | visibility | pressure | cloudcover | HeatIndexC | WindChillC | WindGustKmph | FeelsLikeC | uvIndex | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8 | -14 | -10 | 0.1 | 6.5 | -14 | 6 | 219 | 113 | 0.0 | 87 | 10 | 1029 | 14 | -13 | -17 | 13 | -17 | 1 | 2018-04-01 |
| 1 | -8 | -14 | -10 | 0.1 | 6.5 | -14 | 5 | 221 | 113 | 0.0 | 86 | 10 | 1028 | 25 | -13 | -16 | 11 | -16 | 1 | 2018-04-01 |
| 2 | -8 | -14 | -10 | 0.1 | 6.5 | -14 | 4 | 222 | 116 | 0.0 | 85 | 10 | 1028 | 36 | -13 | -14 | 8 | -14 | 1 | 2018-04-01 |
| 3 | -8 | -14 | -10 | 0.1 | 6.5 | -13 | 3 | 223 | 116 | 0.0 | 84 | 10 | 1027 | 47 | -13 | -13 | 6 | -13 | 1 | 2018-04-01 |
| 4 | -8 | -14 | -10 | 0.1 | 6.5 | -13 | 3 | 173 | 116 | 0.0 | 84 | 10 | 1026 | 62 | -12 | -12 | 6 | -12 | 1 | 2018-04-01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43051 | -4 | -19 | -11 | 0.0 | 10.0 | -11 | 7 | 130 | 260 | 0.0 | 97 | 0 | 1007 | 66 | -11 | -16 | 14 | -16 | 1 | 2023-02-27 |
| 43052 | -4 | -19 | -11 | 0.0 | 10.0 | -16 | 4 | 156 | 260 | 0.0 | 100 | 0 | 1009 | 95 | -16 | -19 | 8 | -19 | 1 | 2023-02-27 |
| 43053 | -4 | -19 | -11 | 0.0 | 10.0 | -17 | 3 | 200 | 260 | 0.0 | 100 | 0 | 1011 | 100 | -17 | -20 | 7 | -20 | 1 | 2023-02-27 |
| 43054 | -4 | -19 | -11 | 0.0 | 10.0 | -18 | 4 | 210 | 260 | 0.0 | 100 | 0 | 1012 | 100 | -18 | -21 | 8 | -21 | 1 | 2023-02-27 |
| 43055 | -4 | -19 | -11 | 0.0 | 10.0 | -18 | 4 | 240 | 260 | 0.0 | 100 | 0 | 1012 | 100 | -18 | -22 | 9 | -22 | 1 | 2023-02-27 |
43056 rows × 20 columns
In [16]:
df = pd.merge(clicks_df, weather_df_by_date, on='date', how='left', indicator=True)
In [17]:
df['_merge'].value_counts()
Out[17]:
_merge both 1758 left_only 89 right_only 0 Name: count, dtype: int64
In [18]:
df.dropna(inplace=True)
In [19]:
df['_merge'].value_counts()
Out[19]:
_merge both 1758 left_only 0 right_only 0 Name: count, dtype: int64
In [20]:
from ydata_profiling import ProfileReport
In [21]:
df.drop(['date',"_merge"], axis=1).corr()
Out[21]:
| month_dt | day_in_month_dt | day_of_week_dt | week_in_year_dt | is_holiday | day_in_year_dt | clicks | bidder_impact | new_befaviour | maxtempC | ... | precipMM | humidity | visibility | pressure | cloudcover | HeatIndexC | WindChillC | WindGustKmph | FeelsLikeC | uvIndex | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| month_dt | 1.000000 | 0.008001 | -0.000809 | 0.967513 | 0.019695 | 0.996412 | -0.190993 | 0.044454 | 0.150674 | 0.148657 | ... | -0.010382 | -0.090976 | 0.043537 | -0.069565 | -0.144789 | 0.145129 | 0.149505 | -0.023335 | 0.149214 | 0.132562 |
| day_in_month_dt | 0.008001 | 1.000000 | 0.001159 | 0.063113 | -0.065674 | 0.092327 | 0.002772 | 0.000098 | -0.003636 | 0.023674 | ... | -0.009181 | -0.010156 | 0.037998 | -0.001840 | -0.036714 | 0.026638 | 0.025228 | 0.031129 | 0.025632 | 0.015899 |
| day_of_week_dt | -0.000809 | 0.001159 | 1.000000 | 0.002069 | -0.148241 | -0.000705 | -0.318300 | -0.001786 | 0.000098 | 0.006618 | ... | 0.045648 | 0.005289 | -0.011810 | -0.025675 | 0.019395 | 0.008815 | 0.005601 | 0.049326 | 0.005504 | -0.004704 |
| week_in_year_dt | 0.967513 | 0.063113 | 0.002069 | 1.000000 | 0.048629 | 0.968883 | -0.191577 | 0.048283 | 0.147267 | 0.141697 | ... | -0.012141 | -0.084251 | 0.048870 | -0.078749 | -0.150955 | 0.137343 | 0.140783 | -0.005155 | 0.140535 | 0.126590 |
| is_holiday | 0.019695 | -0.065674 | -0.148241 | 0.048629 | 1.000000 | 0.014155 | -0.026664 | 0.019173 | 0.005456 | -0.057459 | ... | 0.048889 | 0.041050 | -0.039205 | 0.004391 | 0.047410 | -0.057342 | -0.056484 | -0.024152 | -0.056412 | -0.035651 |
| day_in_year_dt | 0.996412 | 0.092327 | -0.000705 | 0.968883 | 0.014155 | 1.000000 | -0.190320 | 0.044234 | 0.149554 | 0.147168 | ... | -0.011444 | -0.089604 | 0.045515 | -0.068213 | -0.146330 | 0.143854 | 0.148062 | -0.019459 | 0.147806 | 0.131018 |
| clicks | -0.190993 | 0.002772 | -0.318300 | -0.191577 | -0.026664 | -0.190320 | 1.000000 | -0.210558 | -0.443571 | 0.021499 | ... | 0.059416 | -0.052855 | 0.013200 | -0.015246 | 0.056980 | 0.035576 | 0.035221 | -0.086451 | 0.035937 | 0.042409 |
| bidder_impact | 0.044454 | 0.000098 | -0.001786 | 0.048283 | 0.019173 | 0.044234 | -0.210558 | 1.000000 | 0.417844 | -0.060546 | ... | -0.136919 | -0.017700 | -0.048229 | -0.021034 | 0.048378 | -0.094595 | -0.104631 | 0.108024 | -0.104525 | -0.057091 |
| new_befaviour | 0.150674 | -0.003636 | 0.000098 | 0.147267 | 0.005456 | 0.149554 | -0.443571 | 0.417844 | 1.000000 | 0.070994 | ... | -0.082573 | 0.053911 | -0.049675 | 0.030904 | 0.020307 | 0.034494 | 0.031208 | -0.016830 | 0.031170 | 0.092718 |
| maxtempC | 0.148657 | 0.023674 | 0.006618 | 0.141697 | -0.057459 | 0.147168 | 0.021499 | -0.060546 | 0.070994 | 1.000000 | ... | 0.060175 | -0.673511 | 0.526831 | -0.492112 | -0.526465 | 0.987673 | 0.986609 | -0.084348 | 0.986489 | 0.904912 |
| mintempC | 0.159483 | 0.029963 | 0.012383 | 0.150150 | -0.058273 | 0.158466 | 0.034269 | -0.112628 | 0.009503 | 0.956398 | ... | 0.135400 | -0.599872 | 0.481568 | -0.539485 | -0.410667 | 0.985379 | 0.983360 | -0.048927 | 0.983301 | 0.823329 |
| avgtempC | 0.147422 | 0.024553 | 0.007521 | 0.139544 | -0.056675 | 0.145972 | 0.028318 | -0.086849 | 0.046233 | 0.992044 | ... | 0.092302 | -0.647258 | 0.506180 | -0.515709 | -0.478202 | 0.998185 | 0.996153 | -0.069044 | 0.996077 | 0.884962 |
| totalSnow_cm | -0.024360 | -0.020358 | 0.040147 | -0.028003 | 0.002067 | -0.025670 | 0.016985 | 0.021353 | -0.071795 | -0.239932 | ... | 0.275356 | 0.297425 | -0.538219 | 0.063721 | 0.371428 | -0.210476 | -0.214728 | -0.037912 | -0.214724 | -0.288111 |
| sunHour | -0.091664 | 0.006731 | -0.011023 | -0.098925 | -0.057076 | -0.094303 | 0.112869 | -0.119186 | -0.022630 | 0.776056 | ... | 0.095189 | -0.496166 | 0.455507 | -0.284653 | -0.483217 | 0.764808 | 0.766930 | -0.148582 | 0.767750 | 0.780169 |
| tempC | 0.147565 | 0.026401 | 0.009094 | 0.139641 | -0.057624 | 0.146253 | 0.031268 | -0.094212 | 0.036373 | 0.987946 | ... | 0.104282 | -0.639014 | 0.500427 | -0.522312 | -0.464384 | 0.999744 | 0.997478 | -0.064509 | 0.997429 | 0.875601 |
| windspeedKmph | -0.064031 | 0.015223 | 0.048207 | -0.053884 | -0.027168 | -0.062482 | 0.019540 | 0.078734 | -0.046249 | 0.001629 | ... | 0.172958 | -0.073503 | -0.018422 | -0.185487 | 0.106291 | 0.040012 | -0.010843 | 0.889979 | -0.011115 | -0.085789 |
| winddirDegree | 0.055624 | 0.034317 | -0.007678 | 0.066043 | -0.031765 | 0.059476 | -0.131122 | 0.064972 | 0.043331 | -0.122194 | ... | -0.057659 | -0.063403 | 0.116067 | -0.057722 | -0.152231 | -0.134560 | -0.150942 | 0.430677 | -0.151810 | -0.120092 |
| weatherCode | -0.143989 | -0.032262 | 0.022461 | -0.150448 | 0.074485 | -0.145634 | 0.061972 | -0.093022 | -0.106947 | -0.350675 | ... | 0.469313 | 0.574534 | -0.700698 | 0.078764 | 0.760827 | -0.290144 | -0.292403 | -0.057592 | -0.292213 | -0.401686 |
| precipMM | -0.010382 | -0.009181 | 0.045648 | -0.012141 | 0.048889 | -0.011444 | 0.059416 | -0.136919 | -0.082573 | 0.060175 | ... | 1.000000 | 0.224709 | -0.245414 | -0.132822 | 0.295468 | 0.105652 | 0.100826 | 0.078465 | 0.100930 | -0.022049 |
| humidity | -0.090976 | -0.010156 | 0.005289 | -0.084251 | 0.041050 | -0.089604 | -0.052855 | -0.017700 | 0.053911 | -0.673511 | ... | 0.224709 | 1.000000 | -0.608430 | 0.304383 | 0.674826 | -0.635855 | -0.629335 | -0.065004 | -0.627590 | -0.647113 |
| visibility | 0.043537 | 0.037998 | -0.011810 | 0.048870 | -0.039205 | 0.045515 | 0.013200 | -0.048229 | -0.049675 | 0.526831 | ... | -0.245414 | -0.608430 | 1.000000 | -0.263557 | -0.678692 | 0.499646 | 0.500905 | 0.073652 | 0.500531 | 0.490655 |
| pressure | -0.069565 | -0.001840 | -0.025675 | -0.078749 | 0.004391 | -0.068213 | -0.015246 | -0.021034 | 0.030904 | -0.492112 | ... | -0.132822 | 0.304383 | -0.263557 | 1.000000 | 0.157952 | -0.521640 | -0.517412 | -0.171832 | -0.516869 | -0.345647 |
| cloudcover | -0.144789 | -0.036714 | 0.019395 | -0.150955 | 0.047410 | -0.146330 | 0.056980 | 0.048378 | 0.020307 | -0.526465 | ... | 0.295468 | 0.674826 | -0.678692 | 0.157952 | 1.000000 | -0.463685 | -0.466139 | -0.043838 | -0.466146 | -0.593944 |
| HeatIndexC | 0.145129 | 0.026638 | 0.008815 | 0.137343 | -0.057342 | 0.143854 | 0.035576 | -0.094595 | 0.034494 | 0.987673 | ... | 0.105652 | -0.635855 | 0.499646 | -0.521640 | -0.463685 | 1.000000 | 0.997492 | -0.066581 | 0.997584 | 0.876820 |
| WindChillC | 0.149505 | 0.025228 | 0.005601 | 0.140783 | -0.056484 | 0.148062 | 0.035221 | -0.104631 | 0.031208 | 0.986609 | ... | 0.100826 | -0.629335 | 0.500905 | -0.517412 | -0.466139 | 0.997492 | 1.000000 | -0.115995 | 0.999950 | 0.876609 |
| WindGustKmph | -0.023335 | 0.031129 | 0.049326 | -0.005155 | -0.024152 | -0.019459 | -0.086451 | 0.108024 | -0.016830 | -0.084348 | ... | 0.078465 | -0.065004 | 0.073652 | -0.171832 | -0.043838 | -0.066581 | -0.115995 | 1.000000 | -0.116300 | -0.157505 |
| FeelsLikeC | 0.149214 | 0.025632 | 0.005504 | 0.140535 | -0.056412 | 0.147806 | 0.035937 | -0.104525 | 0.031170 | 0.986489 | ... | 0.100930 | -0.627590 | 0.500531 | -0.516869 | -0.466146 | 0.997584 | 0.999950 | -0.116300 | 1.000000 | 0.877241 |
| uvIndex | 0.132562 | 0.015899 | -0.004704 | 0.126590 | -0.035651 | 0.131018 | 0.042409 | -0.057091 | 0.092718 | 0.904912 | ... | -0.022049 | -0.647113 | 0.490655 | -0.345647 | -0.593944 | 0.876820 | 0.876609 | -0.157505 | 0.877241 | 1.000000 |
28 rows × 28 columns
In [63]:
profile = ProfileReport(df.drop(['date',"_merge"], axis=1),
title='Pandas Profiling Report')
In [64]:
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]